Project Luther - Analysis

In [287]:
import numpy as np
import pandas as pd

import dateutil.parser
import datetime
from datetime import timedelta
import math
import calendar

import statsmodels.api as sm
import statsmodels
import seaborn as sns
from numpy import linalg
import patsy

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn import metrics

from sklearn import feature_selection as f_select

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

from pymongo import MongoClient

from statsmodels.formula.api import ols

import matplotlib.pyplot as plt

import yellowbrick as yb
from yellowbrick.regressor.residuals import ResidualsPlot

%matplotlib inline

Pull in scraped data

Data Import

The data from our scraping resides in a Mongo database.

Let's go get it...

In [315]:
client = MongoClient()
db = client.willco
In [316]:
df = pd.DataFrame(
    list(db.PropertyInfo.find())
)

Pulling from Mongo doesn't automatically set NaN the way read_csv does.

So, let's take care of that here.

In [317]:
df.replace('N/A',np.NaN,inplace=True)
df.shape
Out[317]:
(59570, 35)

Pulling from Mongo orders the columns alphabetically (and adds an _id column). Let's reorder the columns to match how the data is presented on the website; the reselection below also drops the _id column...
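For reference, a minimal alternative (not run here) is to drop Mongo's _id column explicitly and leave the remaining columns in place; the explicit column list below achieves the same thing while also reordering:

df = df.drop('_id', axis=1)  # hedged alternative to reselecting the columns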

In [318]:
df = df[[
    'PIN', 'PropClass', 'Address', 'City', 'Zip', 'SaleDate', 'SaleAmt',
    'TaxRate', 'ASL', 'ASFL', 'AI', 'ASB', 'ASFB', 'ASTotal', 'ASFTotal',
    'Subdivision', 'FullBath', 'Style', 'HalfBath', 'LivingSqFt', 'CentralAir',
    'BldgSqFt', 'Fireplace', 'YearBuilt', 'Porch', 'Basement', 'Attic',
    'Garage', 'Lot', 'Block', 'Unit', 'Building', 'Area', 'Status'
]]
In [319]:
df.head()
Out[319]:
PIN PropClass Address City Zip SaleDate SaleAmt TaxRate ASL ASFL ... Porch Basement Attic Garage Lot Block Unit Building Area Status
0 04-10-01-101-001-0000 COMMERCIAL 2800 SCHWEITZER RD JOLIET 60421 NaN NaN 8.5817 (2016) 41,990 0 ... NO NONE NO NONE 1 NaN NaN NaN NaN NaN
1 05-06-01-101-001-0000 COMMERCIAL 1202 ESSINGTON RD JOLIET 60435 10/19/1987 $140,000 10.2841 (2016) 149,410 0 ... NO ??? NO ATTACHED NaN NaN NaN NaN NaN NaN
2 01-24-02-100-001-0000 FARMING 32801 S WEST RIVER RD WILMINGTON 60481 NaN NaN 6.1960 (2016) 6,331 47,103 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 01-24-03-100-003-0000 RESIDENTIAL S RT 53 WILMINGTON 60481 01/01/2016 $33,350 6.1960 (2016) 1,657 0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 01-24-03-100-004-0000 RESIDENTIAL 32849 S RT 53 WILMINGTON 60481 NaN NaN 6.1960 (2016) 6,602 0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 34 columns

In [320]:
df.shape
Out[320]:
(59570, 34)

Initial selections

We've scraped more data than we need. Let's pare it down...

Focus on Residential only

In [321]:
df.PropClass.value_counts()
Out[321]:
RESIDENTIAL    52794
FARMING         2102
OTHER           1985
COMMERCIAL      1730
INDUSTRIAL       959
Name: PropClass, dtype: int64
In [322]:
df = df[df['PropClass'] == 'RESIDENTIAL']

Remove rows with no target data

In [323]:
#df = df[df['SaleDate'].notnull()]
df.dropna(subset=['SaleDate'],inplace=True)
df.shape
Out[323]:
(43835, 34)

Massage the Data

The data is stored just as it appeared on the website. We need a number of transformations to get it into a form we can model with: parse dates, strip currency symbols and units, and convert yes/no flags to numeric values.

Encapsulate the changes in a function

In [324]:
df['Section'] = df['PIN'].apply(lambda x: x[:8])
In [325]:
gx=df.groupby('Section').count()['PIN']
gx[df.tail().Section]
gx['06-03-35']
Out[325]:
804
In [326]:
def masseuse(data):
    ''' Initial field transformations to facilitate further analysis
    Input: data - the dataframe
    Changes are made in place.
    '''
    
    # Parcel "Section" is the first 8 characters of the PIN; Density is the
    # number of parcels sharing that Section.
    data['Section'] = data['PIN'].apply(lambda x: x[:8])
    gx = data.groupby('Section').count()['PIN']
    data['Density'] = data['Section'].apply(lambda x: gx[x]).astype(float)

    # Parse the sale date and compute Longevity: days from the sale to 2018-01-01.
    data['SaleDate'] = data['SaleDate'].apply(lambda x: dateutil.parser.parse(x))
    data['Longevity'] = data['SaleDate'].apply(lambda x: dateutil.parser.parse('2018-01-01') - x)
    data['Longevity'] = data['Longevity'].apply(lambda x: x.total_seconds()/86400.)

    # Strip currency symbols, thousands separators, year annotations, and units,
    # then convert to float.  (Raw strings keep the regex escapes intact.)
    data['SaleAmt'] = data['SaleAmt'].replace(r'[\$,]', '', regex=True).astype(float)
    data['TaxRate'] = data['TaxRate'].replace(r'\(\d+\)', '', regex=True).astype(float)
    data['ASTotal'] = data['ASTotal'].replace(',', '', regex=True).astype(float)
    data['ASFTotal'] = data['ASFTotal'].replace(',', '', regex=True).astype(float)
    data['LivingSqFt'] = data['LivingSqFt'].replace(' Sq. Feet', '', regex=True).astype(float)
    data['BldgSqFt'] = data['BldgSqFt'].replace(' Sq. Feet', '', regex=True).astype(float)

    data['YearBuilt'] = data['YearBuilt'].astype(float)
    data['FullBath'] = data['FullBath'].astype(float)
    data['HalfBath'] = data['HalfBath'].astype(float)

    # Convert YES/NO flags to 1.0/0.0.
    for flag in ['CentralAir', 'Fireplace', 'Porch']:
        data[flag].replace('YES', 1., inplace=True)
        data[flag].replace('NO', 0., inplace=True)
        data[flag] = data[flag].astype(float)

Implement the Changes

In [327]:
masseuse(df)

Prepare to Model

Select rows with data for all chosen features

For the most part, a record from our data source either has complete information or has a whole block of fields marked unavailable. So when data is missing for one feature, it tends to be missing for the same group of features (a quick check is sketched below).
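As a minimal sketch (not part of the run above), we can spot-check that claim by counting the distinct missingness patterns across the candidate features; if fields really go missing together, a handful of patterns should dominate:

cols = ['SaleAmt', 'TaxRate', 'ASTotal', 'LivingSqFt', 'YearBuilt',
        'FullBath', 'HalfBath', 'Fireplace', 'CentralAir', 'Porch']
# each row becomes a tuple of missing/present flags; count how often each pattern occurs
patterns = df[cols].isnull().apply(lambda row: tuple(row), axis=1)
patterns.value_counts().head(10)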

In [354]:
df2 = df[[
    'Longevity',
    'SaleAmt',
    'TaxRate',
    'ASTotal',
    'LivingSqFt',
    'YearBuilt',
    'FullBath',
    'HalfBath',
    'Fireplace',
    'CentralAir',
    'Porch',
    'Section',
    'Density'
]]
df2=df2[df2.notnull().all(axis=1)]
df2.shape
Out[354]:
(33338, 13)

Outlier Treatment

Earlier exploration with smaller datasets showed that a number of our features have significant outliers. Switching to a log scale captured the bulk of the data better, but some outliers remained.

Exploration

The primary features to treat here are:

  • SaleAmt
  • ASTotal
  • LivingSqFt

We could do the same for a few other fields, but several of the features come in near-duplicate groupings. From each grouping I've kept a single feature rather than carry the collinearity along; for example, I'm using LivingSqFt and ignoring BldgSqFt (see the quick check sketched below).
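A minimal sketch (not part of the run above) of why one feature per grouping suffices, assuming masseuse() has already converted these columns to floats:

# near-duplicate pairs: living vs. building square footage, assessed vs. fair assessed totals
df[['LivingSqFt', 'BldgSqFt', 'ASTotal', 'ASFTotal']].corr()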

Preparation
In [355]:
df2 = df2[
    (df2['SaleAmt'] != 0) &
    (df2['LivingSqFt'] != 0) &
    (df2['ASTotal'] != 0)
]
df2.shape
Out[355]:
(30306, 13)
In [356]:
def explore_1(feature):
    ''' Box plot and histogram of a feature: raw, log, and IQR-trimmed log. '''
    fig = plt.figure(figsize=(15, 4))

    # Row 0: the raw feature.
    ax = plt.subplot2grid((3, 2), (0, 0))
    bp = ax.boxplot(
        df2[feature],
        sym='+', vert=False, patch_artist=True)
    ax = plt.subplot2grid((3, 2), (0, 1))
    bp = ax.hist(
        df2[feature],
        bins=100
    )

    # Row 1: the log-transformed feature.
    log_feature = np.log(df2[feature])
    ax = plt.subplot2grid((3, 2), (1, 0))
    bp = ax.boxplot(
        log_feature,
        sym='+', vert=False, patch_artist=True)
    ax = plt.subplot2grid((3, 2), (1, 1))
    bp = ax.hist(
        log_feature,
        bins=100
    )

    # Row 2: the log feature with points outside the 1.5*IQR fences removed.
    q75, q25 = np.percentile(log_feature, [75, 25])
    iqr = q75 - q25
    lmin = q25 - (iqr*1.5)
    lmax = q75 + (iqr*1.5)
    log_trimmed = log_feature[log_feature > lmin]
    log_trimmed = log_trimmed[log_trimmed < lmax]
    ax = plt.subplot2grid((3, 2), (2, 0))
    bp = ax.boxplot(
        log_trimmed,
        sym='+', vert=False, patch_artist=True)
    ax = plt.subplot2grid((3, 2), (2, 1))
    bp = ax.hist(
        log_trimmed,
        bins=100
    )
    
Visualization
In [357]:
explore_1('Longevity')
In [358]:
explore_1('SaleAmt')
In [359]:
explore_1('ASTotal')
In [360]:
explore_1('LivingSqFt')

Decisions

We could let patsy apply the log transformation in the model formula, but doing it here lets us also apply IQR-based outlier removal: for each log-transformed feature we drop rows falling outside the fences Q1 - 1.5*IQR and Q3 + 1.5*IQR.

In [363]:
for feature in ['SaleAmt','ASTotal','LivingSqFt','Longevity']:
    logf = 'Log_' + feature
    df2[logf] = np.log(df2[feature])
    q75, q25 = np.percentile(df2[logf], [75 ,25])
    iqr = q75 - q25
    lmin = q25 - (iqr*1.5)
    lmax = q75 + (iqr*1.5)
    df2 = df2[df2[logf]>lmin]
    df2 = df2[df2[logf]<lmax]
df2.shape
Out[363]:
(27300, 17)
In [161]:
# A quick cut here for prototyping...
df2 = df2[:1000]

Prepare the Model

Review Correlations

In [365]:
df2.corr().sort_values('Longevity')
Out[365]:
Longevity SaleAmt TaxRate ASTotal LivingSqFt YearBuilt FullBath HalfBath Fireplace CentralAir Porch Density Log_SaleAmt Log_ASTotal Log_LivingSqFt Log_Longevity
Log_SaleAmt -0.324790 0.956075 -0.202863 0.680784 0.586806 0.041859 0.054750 0.230793 0.299794 0.004714 0.139275 -0.133594 1.000000 0.701641 0.588082 -0.181865
SaleAmt -0.282381 1.000000 -0.178728 0.745069 0.640559 0.042522 0.075237 0.258146 0.319281 -0.042849 0.148568 -0.173099 0.956075 0.731251 0.618274 -0.155813
CentralAir -0.121918 -0.042849 0.167145 -0.161761 0.060385 -0.015568 -0.214047 -0.157721 -0.301008 1.000000 -0.309217 -0.085575 0.004714 -0.132254 0.112910 -0.084335
LivingSqFt -0.045015 0.640559 0.009063 0.773210 1.000000 0.033473 0.063043 0.268837 0.314672 0.060385 0.136709 -0.223614 0.586806 0.748974 0.977233 -0.001924
Log_LivingSqFt -0.040790 0.618274 0.029309 0.737280 0.977233 0.036480 0.054946 0.276854 0.301384 0.112910 0.135821 -0.207039 0.588082 0.734685 1.000000 0.002972
HalfBath -0.040120 0.258146 0.175852 0.296385 0.268837 0.022763 0.422643 1.000000 0.138804 -0.157721 0.291061 -0.025319 0.230793 0.270424 0.276854 -0.012412
TaxRate -0.037848 -0.178728 1.000000 -0.308856 0.009063 -0.029393 0.261750 0.175852 -0.080985 0.167145 0.043957 0.089073 -0.202863 -0.358858 0.029309 -0.031492
Density -0.003588 -0.173099 0.089073 -0.211503 -0.223614 0.033124 0.076763 -0.025319 -0.077688 -0.085575 0.037000 1.000000 -0.133594 -0.183686 -0.207039 0.010409
YearBuilt -0.003125 0.042522 -0.029393 0.049635 0.033473 1.000000 0.017528 0.022763 0.079880 -0.015568 0.039966 0.033124 0.041859 0.053942 0.036480 0.000731
FullBath 0.007991 0.075237 0.261750 0.115947 0.063043 0.017528 1.000000 0.422643 0.104505 -0.214047 0.282449 0.076763 0.054750 0.093242 0.054946 0.014429
ASTotal 0.013636 0.745069 -0.308856 1.000000 0.773210 0.049635 0.115947 0.296385 0.438262 -0.161761 0.252863 -0.211503 0.680784 0.976450 0.737280 0.045667
Log_ASTotal 0.021035 0.731251 -0.358858 0.976450 0.748974 0.053942 0.093242 0.270424 0.440279 -0.132254 0.247498 -0.183686 0.701641 1.000000 0.734685 0.052105
Fireplace 0.072357 0.319281 -0.080985 0.438262 0.314672 0.079880 0.104505 0.138804 1.000000 -0.301008 0.300044 -0.077688 0.299794 0.440279 0.301384 0.058021
Porch 0.091556 0.148568 0.043957 0.252863 0.136709 0.039966 0.282449 0.291061 0.300044 -0.309217 1.000000 0.037000 0.139275 0.247498 0.135821 0.079053
Log_Longevity 0.927041 -0.155813 -0.031492 0.045667 -0.001924 0.000731 0.014429 -0.012412 0.058021 -0.084335 0.079053 0.010409 -0.181865 0.052105 0.002972 1.000000
Longevity 1.000000 -0.282381 -0.037848 0.013636 -0.045015 -0.003125 0.007991 -0.040120 0.072357 -0.121918 0.091556 -0.003588 -0.324790 0.021035 -0.040790 0.927041

Define the model

In [379]:
y, X = patsy.dmatrices('Log_Longevity ~ Log_SaleAmt + TaxRate + Log_ASTotal + Log_LivingSqFt + YearBuilt + Fireplace + CentralAir + Porch', data=df2, return_type="dataframe")
In [380]:
X.head()
Out[380]:
Intercept Log_SaleAmt TaxRate Log_ASTotal Log_LivingSqFt YearBuilt Fireplace CentralAir Porch
558 1.0 11.198215 7.8868 11.336510 7.271704 1949.0 1.0 1.0 0.0
589 1.0 12.691580 8.0742 11.577111 7.477604 1995.0 1.0 1.0 1.0
606 1.0 12.283034 8.9128 11.320869 7.451242 1979.0 1.0 1.0 0.0
607 1.0 11.492723 8.9128 11.343252 7.971776 1979.0 1.0 1.0 1.0
608 1.0 12.301383 8.9128 11.264977 7.614805 1978.0 1.0 1.0 1.0

Standardize

In [381]:
# fit_transform returns a numpy array; wrap it back into a DataFrame so the
# column names and .iloc indexing used in later cells keep working
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns, index=X.index)

Model Experimentation and Evaluation

As a baseline reference point, the mean of the target:

In [370]:
y.mean()
Out[370]:
Log_Longevity    8.246999
dtype: float64
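For intuition, a hedged back-of-the-envelope conversion (Longevity is measured in days, so Log_Longevity is the log of days since the last sale):

np.exp(8.247)         # roughly 3.8e3 days since the last sale, on average
np.exp(8.247) / 365.  # roughly 10.5 years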

Basic Linear Regression

In [371]:
reg = LinearRegression()
scores = cross_val_score(reg, X, y, cv=10, scoring='neg_mean_squared_error')

# cross_val_score reports negative MSE (sklearn maximizes scores), so negate before taking the square root
print(np.array([math.sqrt(x) for x in -scores]))
[ 0.77782411  0.75733912  0.75356208  0.76513682  0.71110719  0.71484247
  0.71611499  0.73752505  0.76529421  0.76893407]

Basic Polynomial Regression

In [204]:
reg = LinearRegression()
degree=2
est = make_pipeline(PolynomialFeatures(degree), LinearRegression())
scores = cross_val_score(est, X, y, cv=10, scoring='neg_mean_squared_error')

# cross_val_score reports negative MSE (sklearn maximizes scores), so negate before taking the square root
print(np.array([math.sqrt(x) for x in -scores]))
[  2684.53466593   3598.71157638   2656.24439764   4226.07078147
   8371.45311281   2711.73830043   9575.95944592   4784.46036978
  14569.72086452   3821.61555235]
In [372]:
reg = LinearRegression()
scoring = ['neg_mean_squared_error', 'r2']
scores = cross_validate(reg, X, y, scoring=scoring,cv=10, return_train_score=True)
scores
Out[372]:
{'fit_time': array([ 0.02397871,  0.00995731,  0.01100755,  0.01839995,  0.01383066,
         0.0111587 ,  0.00897789,  0.0065949 ,  0.0133121 ,  0.00609326]),
 'score_time': array([ 0.00095797,  0.0007441 ,  0.00128365,  0.00065398,  0.00082874,
         0.00080156,  0.00058365,  0.00255775,  0.0021584 ,  0.02351403]),
 'test_neg_mean_squared_error': array([-0.60501035, -0.57356254, -0.5678558 , -0.58543436, -0.50567344,
        -0.51099975, -0.51282068, -0.5439432 , -0.58567523, -0.5912596 ]),
 'test_r2': array([ 0.05880326,  0.03201669,  0.06040886,  0.04724393,  0.19900863,
         0.10479881,  0.10876697,  0.18902857,  0.04195548,  0.07636793]),
 'train_neg_mean_squared_error': array([-0.55017119, -0.55376059, -0.554242  , -0.55231256, -0.56147301,
        -0.56051199, -0.56033911, -0.55708868, -0.55240355, -0.55160644]),
 'train_r2': array([ 0.10442918,  0.10717405,  0.10447899,  0.10593538,  0.08691485,
         0.09981387,  0.09931987,  0.08834454,  0.10603413,  0.10297949])}

Feature evaluation

In [224]:
kf = KFold(n_splits=5, shuffle=True)


# create a linear regression model
lm = LinearRegression()
all_scores=[]

# get indices of corresponding train & test
for train,test in kf.split(X):
    x_train=X.iloc[train]
    y_train=y.iloc[train]
    x_test=X.iloc[test]
    y_test=y.iloc[test]

    pvals=[]
    sig_cols=[]

    # within each training set, find the significant variables
    for feature in x_train.columns:
        # f_regression returns (F-statistics, p-values); keep the feature if its p-value < .01
        pval = f_select.f_regression(x_train[[feature]], y_train)

        if pval[1][0] < .01:
            sig_cols.append(feature)
            pvals.append(pval[1][0])
            
    lm.fit(x_train[sig_cols],y_train)
    print(sig_cols)
    
    r_2=lm.score(x_test[sig_cols],y_test)
    all_scores.append(r_2)
    
np.mean(all_scores)
/home/joseph/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
/home/joseph/anaconda3/lib/python3.6/site-packages/sklearn/feature_selection/univariate_selection.py:298: RuntimeWarning: divide by zero encountered in true_divide
  corr /= X_norms
/home/joseph/anaconda3/lib/python3.6/site-packages/sklearn/feature_selection/univariate_selection.py:303: RuntimeWarning: invalid value encountered in true_divide
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
['Log_SaleAmt']
['Log_SaleAmt']
['Log_SaleAmt']
['Log_SaleAmt']
['Log_SaleAmt']
Out[224]:
0.06713711141161946

Ridge Regression

In [346]:
alpha = 0.01
degree=2
est = make_pipeline(PolynomialFeatures(degree), Ridge(alpha=alpha))
scoring = ['neg_mean_squared_error', 'r2','neg_median_absolute_error']
scores = cross_validate(est, X, y, scoring=scoring,cv=10, return_train_score=True)
scores
Out[346]:
{'fit_time': array([ 0.0383575 ,  0.03902268,  0.06465101,  0.02653861,  0.05691075,
         0.02652287,  0.02911663,  0.0263288 ,  0.02759314,  0.02797771]),
 'score_time': array([ 0.01015306,  0.00774145,  0.006042  ,  0.00612688,  0.00551128,
         0.00550604,  0.00663495,  0.00530505,  0.0055716 ,  0.00543451]),
 'test_neg_mean_squared_error': array([-6552866.39607118, -6424976.71627184, -5578946.2813937 ,
        -6474580.13980465, -5497705.1562018 , -5266566.4793487 ,
        -5428840.11774937, -5844798.216485  , -6065522.7262222 ,
        -6085610.5529773 ]),
 'test_neg_median_absolute_error': array([-1884.79022689, -1877.82010082, -1819.83720786, -1951.76222224,
        -1843.86381026, -1403.31834983, -1615.9963883 , -1712.86381034,
        -1894.10590129, -2130.38745273]),
 'test_r2': array([ 0.20104611,  0.13625071,  0.21961964,  0.17697301,  0.44371575,
         0.25684521,  0.26887013,  0.41368829,  0.19955077,  0.19557962]),
 'train_neg_mean_squared_error': array([-5740008.14170122, -5753990.07167607, -5845120.07213862,
        -5747445.12399782, -5864026.58673497, -5882167.10136222,
        -5863522.26334443, -5819996.0980089 , -5791260.61064104,
        -5788396.36766685]),
 'train_neg_median_absolute_error': array([-1762.18769362, -1754.6132306 , -1794.2406224 , -1756.61421128,
        -1805.40114898, -1814.82995354, -1790.4398609 , -1803.53573895,
        -1769.90678763, -1753.67852041]),
 'train_r2': array([ 0.28719153,  0.29311568,  0.28475377,  0.28980854,  0.25216487,
         0.28102361,  0.27999124,  0.25765095,  0.28689218,  0.28775973])}
In [349]:
# average median absolute error over the folds, converted from days to years (negative by sklearn convention)
scores['test_neg_median_absolute_error'].mean()/365
Out[349]:
-4.9684234165931986

Lasso Regression

In [307]:
alpha = 100
degree=2
est = make_pipeline(PolynomialFeatures(degree), Lasso(alpha=alpha))
scoring = ['neg_mean_squared_error', 'r2']
scores = cross_validate(est, X, y, scoring=scoring,cv=10, return_train_score=True)
scores
/home/joseph/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
(this warning is emitted once for each of the 10 cross-validation folds)
Out[307]:
{'fit_time': array([ 0.02560949,  0.02187872,  0.02094412,  0.02065277,  0.0207479 ,
         0.03086805,  0.02326274,  0.04340053,  0.02811837,  0.02141428]),
 'score_time': array([ 0.00620604,  0.00142932,  0.00143504,  0.00993085,  0.00158286,
         0.00256944,  0.00166798,  0.0014298 ,  0.00157762,  0.00170541]),
 'test_neg_mean_squared_error': array([-7151748.83656557, -5568487.2555249 , -3964787.90494425,
        -5447854.63451336, -6599747.10516303, -5521967.04732249,
        -9062455.18253427, -6480922.76869424, -6265248.45679911,
        -6606277.41861003]),
 'test_r2': array([ 0.26184413,  0.25941534, -0.25999288, -0.52640713,  0.13502402,
        -0.30981554,  0.22594138, -0.01427154,  0.13464504,  0.19432266]),
 'train_neg_mean_squared_error': array([-5906716.34895177, -6063374.22698716, -6249906.41256639,
        -6112959.25344711, -5953526.16339988, -6073457.17191973,
        -5684282.27855621, -5969932.78812524, -6008794.39396149,
        -5960969.4204715 ]),
 'train_r2': array([ 0.1531144 ,  0.16343346,  0.18533985,  0.19092769,  0.17629505,
         0.19639109,  0.15743029,  0.18504407,  0.17472741,  0.16960665])}
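The convergence warnings above indicate the coordinate-descent solver ran out of iterations. A hedged tweak (not run here) is to raise Lasso's max_iter when building the pipeline, e.g.:

est = make_pipeline(PolynomialFeatures(degree), Lasso(alpha=alpha, max_iter=10000))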

Bias and Variance Evaluation

Homespun

First, my own attempt at developing this plot...

In [250]:
def sample_test_train(N,k,size,seed=False):
    ''' Create a set of train/test samples.
        The goal here is random sets of a certain size.
        There is no intention of these sets being non-overlapping.
        Size is same for both test and train unless the requested
        size is more than half of the N provided.
        Input:
            N    the size of the data
            k    the number of samples desired
            size the size of the samples desired
        Output:
            List of pairs of list of indices
    '''
    if seed:
        np.random.seed(seed)
    output = []
    for i in range(k):
        indices = np.random.permutation(N)
        output.append([indices[:size],indices[size:min(size*2,N)]])
    return output
In [266]:
def learning_plot(est,X,y,points=10,rep=10):
    """ Create a plot to visualize the learning of the model.
    Inputs:
        est    the estimator
        X      the features
        y      the target
        points the number of training-set sizes to evaluate
        rep    the number of repetitions at each test size
    """
    x = np.arange(points)
    train_err = np.array([])
    test_err = np.array([])
    N = len(X)
    sizes = [math.ceil(0.9 * N * (1 + i)/points) for i in x]
    for size in sizes:
        train = np.array([])
        test = np.array([])
        for train_ind, test_ind in sample_test_train(len(X), rep, size):
            X_train = X.iloc[train_ind]
            X_test  = X.iloc[test_ind]
            y_train = y.iloc[train_ind]
            y_test  = y.iloc[test_ind]
            est.fit(X_train, y_train)
            train = np.append(train,
                metrics.mean_squared_error(y_train, est.predict(X_train)))
            test = np.append(test,
                metrics.mean_squared_error(y_test, est.predict(X_test)))
        train_err = np.append(train_err,train.mean())
        test_err = np.append(test_err,test.mean())
    plt.figure(figsize=(8,6))
    plt.scatter(sizes, train_err, c='b', label='train MSE')
    plt.scatter(sizes, test_err, c='r', label='test MSE')
    plt.xlabel('training set size')
    plt.ylabel('MSE')
    plt.legend(loc='best')
    
In [276]:
est = make_pipeline(LinearRegression())
learning_plot(est,X,y,20,100)

Using sklearn

In [291]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.4,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.4, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt
In [338]:
est = make_pipeline(LinearRegression())

plot_learning_curve(
    est,
    "Learning Curves (Linear Regression)",
    X[:1000], y[:1000],
    cv = ShuffleSplit(n_splits=100, test_size=0.2),
    n_jobs=4,
    #ylim=(0.7, 1.01)
)
Out[338]:
<module 'matplotlib.pyplot' from '/home/joseph/anaconda3/lib/python3.6/site-packages/matplotlib/pyplot.py'>
In [337]:
est = make_pipeline(LinearRegression())

plot_learning_curve(
    est,
    "Learning Curves (Linear Regression)",
    X, y,
    cv = ShuffleSplit(n_splits=100, test_size=0.2),
    n_jobs=4,
    #ylim=(0.7, 1.01)
)
Out[337]:
<module 'matplotlib.pyplot' from '/home/joseph/anaconda3/lib/python3.6/site-packages/matplotlib/pyplot.py'>
In [384]:
alpha = 0.01
degree=2
est = make_pipeline(PolynomialFeatures(degree), Ridge(alpha=alpha))
plot_learning_curve(
    est,
    "Learning Curves (Ridge Regression)",
    X, y,
    cv = ShuffleSplit(n_splits=100, test_size=0.2),
    n_jobs=4,
    #ylim=(0.7, 1.01)
)
Out[384]:
<module 'matplotlib.pyplot' from '/home/joseph/anaconda3/lib/python3.6/site-packages/matplotlib/pyplot.py'>

Check for Homoscedasticity

In [373]:
est = make_pipeline(LinearRegression())
# Create the train and test data
X_train, X_test, y_train, y_test = train_test_split(X[:1000], y[:1000], test_size=0.2)

visualizer = ResidualsPlot(est)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
visualizer.ax.get_yaxis().set_ticks([])
g = visualizer.poof()             # Draw/show/poof the data
In [313]:
alpha = 0.01
degree=2
est = make_pipeline(PolynomialFeatures(degree), Ridge(alpha=alpha))
# Create the train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

visualizer = ResidualsPlot(est)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
g = visualizer.poof()             # Draw/show/poof the data
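The residual plots above give a visual check. As a complementary, hedged sketch (not part of the run above), a Breusch-Pagan test on an OLS fit of the same formula would quantify it, using statsmodels' het_breuschpagan (available in statsmodels 0.8+):

from statsmodels.stats.diagnostic import het_breuschpagan

# refit the same formula with statsmodels so we have residuals and the design matrix
fit = ols('Log_Longevity ~ Log_SaleAmt + TaxRate + Log_ASTotal + Log_LivingSqFt'
          ' + YearBuilt + Fireplace + CentralAir + Porch', data=df2).fit()
lm_stat, lm_pval, f_stat, f_pval = het_breuschpagan(fit.resid, fit.model.exog)
lm_pval  # a small p-value would suggest heteroscedastic residuals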